{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n",
    "\n",
    "# 5 - Glue Catalog\n",
    "\n",
    "[awswrangler](https://github.com/aws/aws-sdk-pandas) makes heavy use of [Glue Catalog](https://aws.amazon.com/glue/) to store metadata of tables and connections."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "import awswrangler as wr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enter your bucket name:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ············\n"
     ]
    }
   ],
   "source": [
    "import getpass\n",
    "\n",
    "bucket = getpass.getpass()\n",
    "path = f\"s3://{bucket}/data/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Creating a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>price</th>\n",
       "      <th>in_stock</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>shoes</td>\n",
       "      <td>50.3</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>tshirt</td>\n",
       "      <td>10.5</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>ball</td>\n",
       "      <td>20.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id    name  price  in_stock\n",
       "0   1   shoes   50.3      True\n",
       "1   2  tshirt   10.5      True\n",
       "2   3    ball   20.0     False"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    {\"id\": [1, 2, 3], \"name\": [\"shoes\", \"tshirt\", \"ball\"], \"price\": [50.3, 10.5, 20.0], \"in_stock\": [True, True, False]}\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Checking Glue Catalog Databases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            Database                                   Description\n",
      "0  aws_sdk_pandas  AWS SDK for pandas Test Arena - Glue Database\n",
      "1            default                         Default Hive database\n"
     ]
    }
   ],
   "source": [
    "databases = wr.catalog.databases()\n",
    "print(databases)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create the database awswrangler_test if not exists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            Database                                   Description\n",
      "0  aws_sdk_pandas  AWS SDK for pandas Test Arena - Glue Database\n",
      "1   awswrangler_test                                              \n",
      "2            default                         Default Hive database\n"
     ]
    }
   ],
   "source": [
    "if \"awswrangler_test\" not in databases.values:\n",
    "    wr.catalog.create_database(\"awswrangler_test\")\n",
    "    print(wr.catalog.databases())\n",
    "else:\n",
    "    print(\"Database awswrangler_test already exists\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Checking the empty database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Database</th>\n",
       "      <th>Table</th>\n",
       "      <th>Description</th>\n",
       "      <th>Columns</th>\n",
       "      <th>Partitions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Database, Table, Description, Columns, Partitions]\n",
       "Index: []"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.tables(database=\"awswrangler_test\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Writing DataFrames to Data Lake (S3 + Parquet + Glue Catalog)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "desc = \"This is my product table.\"\n",
    "\n",
    "param = {\"source\": \"Product Web Service\", \"class\": \"e-commerce\"}\n",
    "\n",
    "comments = {\n",
    "    \"id\": \"Unique product ID.\",\n",
    "    \"name\": \"Product name\",\n",
    "    \"price\": \"Product price (dollar)\",\n",
    "    \"in_stock\": \"Is this product availaible in the stock?\",\n",
    "}\n",
    "\n",
    "res = wr.s3.to_parquet(\n",
    "    df=df,\n",
    "    path=f\"s3://{bucket}/products/\",\n",
    "    dataset=True,\n",
    "    database=\"awswrangler_test\",\n",
    "    table=\"products\",\n",
    "    mode=\"overwrite\",\n",
    "    glue_table_settings=wr.typing.GlueTableSettings(description=desc, parameters=param, columns_comments=comments),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checking Glue Catalog (AWS Console)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Glue Console](_static/glue_catalog_table_products.png \"Glue Console\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Looking Up for the new table!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Database</th>\n",
       "      <th>Table</th>\n",
       "      <th>Description</th>\n",
       "      <th>Columns</th>\n",
       "      <th>Partitions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>awswrangler_test</td>\n",
       "      <td>products</td>\n",
       "      <td>This is my product table.</td>\n",
       "      <td>id, name, price, in_stock</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Database     Table                Description  \\\n",
       "0  awswrangler_test  products  This is my product table.   \n",
       "\n",
       "                     Columns Partitions  \n",
       "0  id, name, price, in_stock             "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.tables(name_contains=\"roduc\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Database</th>\n",
       "      <th>Table</th>\n",
       "      <th>Description</th>\n",
       "      <th>Columns</th>\n",
       "      <th>Partitions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>awswrangler_test</td>\n",
       "      <td>products</td>\n",
       "      <td>This is my product table.</td>\n",
       "      <td>id, name, price, in_stock</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Database     Table                Description  \\\n",
       "0  awswrangler_test  products  This is my product table.   \n",
       "\n",
       "                     Columns Partitions  \n",
       "0  id, name, price, in_stock             "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.tables(name_prefix=\"pro\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Database</th>\n",
       "      <th>Table</th>\n",
       "      <th>Description</th>\n",
       "      <th>Columns</th>\n",
       "      <th>Partitions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>awswrangler_test</td>\n",
       "      <td>products</td>\n",
       "      <td>This is my product table.</td>\n",
       "      <td>id, name, price, in_stock</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Database     Table                Description  \\\n",
       "0  awswrangler_test  products  This is my product table.   \n",
       "\n",
       "                     Columns Partitions  \n",
       "0  id, name, price, in_stock             "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.tables(name_suffix=\"ts\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Database</th>\n",
       "      <th>Table</th>\n",
       "      <th>Description</th>\n",
       "      <th>Columns</th>\n",
       "      <th>Partitions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>awswrangler_test</td>\n",
       "      <td>products</td>\n",
       "      <td>This is my product table.</td>\n",
       "      <td>id, name, price, in_stock</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Database     Table                Description  \\\n",
       "0  awswrangler_test  products  This is my product table.   \n",
       "\n",
       "                     Columns Partitions  \n",
       "0  id, name, price, in_stock             "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.tables(search_text=\"This is my\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Getting tables details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Column Name</th>\n",
       "      <th>Type</th>\n",
       "      <th>Partition</th>\n",
       "      <th>Comment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>id</td>\n",
       "      <td>bigint</td>\n",
       "      <td>False</td>\n",
       "      <td>Unique product ID.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>name</td>\n",
       "      <td>string</td>\n",
       "      <td>False</td>\n",
       "      <td>Product name</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>price</td>\n",
       "      <td>double</td>\n",
       "      <td>False</td>\n",
       "      <td>Product price (dollar)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>in_stock</td>\n",
       "      <td>boolean</td>\n",
       "      <td>False</td>\n",
       "      <td>Is this product availaible in the stock?</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Column Name     Type  Partition                                   Comment\n",
       "0          id   bigint      False                        Unique product ID.\n",
       "1        name   string      False                              Product name\n",
       "2       price   double      False                    Product price (dollar)\n",
       "3    in_stock  boolean      False  Is this product availaible in the stock?"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wr.catalog.table(database=\"awswrangler_test\", table=\"products\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning Up the Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "for table in wr.catalog.get_tables(database=\"awswrangler_test\"):\n",
    "    wr.catalog.delete_table_if_exists(database=\"awswrangler_test\", table=table[\"Name\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Delete Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "wr.catalog.delete_database(\"awswrangler_test\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13 (main, Aug  2 2022, 15:07:42) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "bd595004b250e5f4145a0d632609b0d8f97d1ccd278d58fafd6840c0467021f9"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
